{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ef8e9cdc-dfd4-4e54-a332-4b9bde4e6047",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI(\n",
    "    base_url='http://localhost:11434/v1/',\n",
    "    api_key='ollama',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2c05052f-a85a-4137-8398-0fd0be678599",
   "metadata": {},
   "outputs": [],
   "source": [
    "from elasticsearch import Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a78df1cc-5a5a-40b4-b673-19c7f0319453",
   "metadata": {},
   "outputs": [],
   "source": [
    "es_client = Elasticsearch('http://localhost:9200') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c9367c18-41ad-495e-9920-1a0c552f0d18",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index_settings = {\n",
    "    \"settings\": {\n",
    "        \"number_of_shards\": 1,\n",
    "        \"number_of_replicas\": 0\n",
    "    },\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"section\": {\"type\": \"text\"},\n",
    "            \"question\": {\"type\": \"text\"},\n",
    "            \"course\": {\"type\": \"keyword\"} \n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "index_name = \"course-questions\"\n",
    "\n",
    "es_client.indices.create(index=index_name, body=index_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e131e3a3-4051-4fd7-8e4d-d17c2af2ad75",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests \n",
    "\n",
    "docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'\n",
    "docs_response = requests.get(docs_url)\n",
    "documents_raw = docs_response.json()\n",
    "\n",
    "documents = []\n",
    "\n",
    "for course in documents_raw:\n",
    "    course_name = course['course']\n",
    "\n",
    "    for doc in course['documents']:\n",
    "        doc['course'] = course_name\n",
    "        documents.append(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5c230059-e219-4a13-a7f8-ede4cf1b028f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "70fe3c97-916d-42c0-bd7b-4f42d9056409",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d74c0925b2eb48b8b301b1f418b6938a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/948 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for doc in tqdm(documents):\n",
    "    es_client.index(index=index_name, document=doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7c72e000-910b-4fb5-aa88-2561e7bc39f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def elastic_search(query):\n",
    "    search_query = {\n",
    "        \"size\": 5,\n",
    "        \"query\": {\n",
    "            \"bool\": {\n",
    "                \"must\": {\n",
    "                    \"multi_match\": {\n",
    "                        \"query\": query,\n",
    "                        \"fields\": [\"question^3\", \"text\", \"section\"],\n",
    "                        \"type\": \"best_fields\"\n",
    "                    }\n",
    "                },\n",
    "                \"filter\": {\n",
    "                    \"term\": {\n",
    "                        \"course\": \"data-engineering-zoomcamp\"\n",
    "                    }\n",
    "                }\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\n",
    "    response = es_client.search(index=index_name, body=search_query)\n",
    "    \n",
    "    result_docs = []\n",
    "    \n",
    "    for hit in response['hits']['hits']:\n",
    "        result_docs.append(hit['_source'])\n",
    "    \n",
    "    return result_docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "bdc51143-f861-46e2-8a8f-e5c6a324f53b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_prompt(query, search_results):\n",
    "    prompt_template = \"\"\"\n",
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
    "Use only the facts from the CONTEXT when answering the QUESTION.\n",
    "\n",
    "QUESTION: {question}\n",
    "\n",
    "CONTEXT: \n",
    "{context}\n",
    "\"\"\".strip()\n",
    "\n",
    "    context = \"\"\n",
    "    \n",
    "    for doc in search_results:\n",
    "        context = context + f\"section: {doc['section']}\\nquestion: {doc['question']}\\nanswer: {doc['text']}\\n\\n\"\n",
    "    \n",
    "    prompt = prompt_template.format(question=query, context=context).strip()\n",
    "    return prompt\n",
    "\n",
    "def llm(prompt):\n",
    "    response = client.chat.completions.create(\n",
    "        model='phi3',\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}]\n",
    "    )\n",
    "    \n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "81abecbc-eb6b-428f-ab7d-7e21f58b64de",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rag(query):\n",
    "    search_results = elastic_search(query)\n",
    "    prompt = build_prompt(query, search_results)\n",
    "    answer = llm(prompt)\n",
    "    return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0ea9315a-a619-4066-9e90-8c260f2c8450",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' Yes, you can still join the course even if you discover it after the start date. There will be deadlines for turning in final projects, but materials and support are available for those who start later. Additionally, we keep all the materials after the course finishes, so you can follow the course at your own pace.'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "query = 'I just disovered the course. Can I still join it?'\n",
    "rag(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8095274-c9cd-4fd5-80d2-069fc951834f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
